0) Data dictionary
#check out this Kaggle: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data
1) split data
# Load the Kaggle house-prices data, standardize it, and build the
# train/test split plus cross-validation folds used throughout.
set.seed(123) # seed BEFORE slice_sample()/initial_split(): both are random, so without this the run is irreproducible
a <- read_csv('house_prices.csv') %>%
  janitor::clean_names() %>%              # snake_case column names
  mutate_if(is.character, factor) %>%     # recipes steps below expect factors, not character columns
  select(sort(tidyselect::peek_vars())) %>% # alphabetize columns
  select(
    where(lubridate::is.Date),
    where(is.factor),
    where(is.numeric)
  ) %>%
  slice_sample(prop = 0.333) #limit data for speed
split <- a %>% initial_split(strata = sale_price) # stratify on the outcome so price ranges are balanced across sets
train <- split %>% training()
test <- split %>% testing()
vfold <- train %>% vfold_cv(v = 5) #five folds for speed
Check for missing values, infinities, and zeroes


# Profile each column (NA %, zeros, type) sorted by proportion missing, descending.
a %>% funModeling::status() %>% arrange(-p_na)
#---------------------------------------------------
# a few cols have 80%+ missing values; imo not worth imputing, so I will discard those cols
a <- a %>% select(-c(pool_qc, misc_feature, alley, fence))
Visualize the distribution of the outcome (y)
# Histogram of the raw outcome, then on a log10 x-axis for comparison.
ggplot(a, aes(x = sale_price)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(a, aes(x = sale_price)) + geom_histogram() + scale_x_log10() # def better to log transform outcome/y 'sale_price'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

2) create specs for rec, model, workflow
#---------------------------------------------------rec
# Preprocessing recipe: log10 the outcome, handle unseen factor levels,
# impute missing numerics, drop uninformative/correlated predictors,
# normalize, and one-hot encode the remaining nominal predictors.
rec <- train %>% recipe(sale_price ~ .) %>%
  step_log(sale_price, base = 10) %>%                 # outcome is right-skewed (see histograms above)
  step_unknown(all_nominal(), -all_outcomes()) %>%    # map NA/novel factor levels to "unknown"
  step_impute_bag(all_numeric(), -all_outcomes()) %>% # bagged-tree imputation; step_bagimpute() is the deprecated name
  step_zv(all_numeric(), -all_outcomes()) %>%         # drop zero-variance predictors
  step_nzv(all_numeric(), -all_outcomes()) %>%        # drop near-zero-variance predictors
  step_corr(all_numeric(), -all_outcomes()) %>%       # drop highly correlated predictors
  #----------------------------
  step_normalize(all_numeric(), -all_outcomes()) %>%
  #step_pca(all_numeric(), num_comp = 3, -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE)
# Print the recipe's trained step summary (captured output below).
rec %>% tidy
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 79
##
## Training data contained 366 data points and 366 incomplete rows.
##
## Operations:
##
## Log transformation on sale_price [trained]
## Unknown factor level assignment for alley, bldg_type, bsmt_cond, ... [trained]
## Bagged tree imputation for bedroom_abv_gr, bsmt_fin_sf1, ... [trained]
## Zero variance filter removed no terms [trained]
## Sparse, unbalanced variable filter removed bsmt_half_bath, ... [trained]
## Correlation filter removed no terms [trained]
## Centering and scaling for bedroom_abv_gr, bsmt_fin_sf1, ... [trained]
## Dummy variables from alley, bldg_type, bsmt_cond, ... [trained]
#check rec is working properly
# Prep the recipe once and reuse it for both bakes (the original called
# `rec %>% prep` twice, re-estimating every step for no benefit).
rec.prepped <- rec %>% prep()
baked.train <- rec.prepped %>% bake(new_data = NULL) # new_data = NULL returns the processed training set
#baked.train %>% funModeling::status()
baked.train %>% head
#----------------------------
baked.test <- rec.prepped %>% bake(new_data = test)
#baked.test %>% funModeling::status()
baked.test %>% head
#---------------------------------------------------.mdl
#a %>% usemodels::use_xgboost(sale_price ~ .)
#Ref:https://parsnip.tidymodels.org/reference/boost_tree.html
# XGBoost regression spec; every hyperparameter marked tune() is searched
# by tune_grid() below.
mdl <- parsnip::boost_tree(
  trees = tune(),          # number of boosting rounds (xgboost nrounds)
  min_n = tune(),          # minimum observations per terminal node (min_child_weight)
  #mtry = tune(),
  tree_depth = tune(),     # maximum tree depth (max_depth)
  #---gbm hps
  learn_rate = tune(),     # shrinkage/eta; see the notes below
  loss_reduction = tune(), # minimum loss reduction to split (gamma)
  sample_size = tune()     # row subsampling proportion (subsample)
) %>%
  set_mode('regression') %>%
  set_engine('xgboost')
#learn_rate:
#Ref: https://bradleyboehmke.github.io/HOML/gbm.html#basic-gbm
# This hyperparameter is also called SHRINKAGE. Generally, the SMALLER this value, the more ACCURATE the model can be but also will REQUIRE MORE TREES in the sequence.
# Determines the contribution of each tree on the final outcome and CONTROLS HOW QUICKLY the algorithm proceeds down the gradient descent (learns); see Figure 12.3. Values range from 0–1 with TYPICAL VALUES BETWEEN 0.001–0.3.
# SMALLER values make the model robust to the specific characteristics of each individual tree, thus allowing it to GENERALIZE well. Smaller values also make it easier to stop prior to overfitting; however, they increase the risk of not reaching the optimum with a fixed number of trees and are more computationally demanding.
#---------------------------------------------------wf
# Bundle the preprocessing recipe and the model spec into one workflow.
wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(mdl)
beep(5) # audible alert (beepr) — the steps above can take a while
3) execute tunegrid, finalize workflow
set.seed(345)
doParallel::registerDoParallel()
# Evaluate 5 candidate hyperparameter sets over the 5 CV folds.
tg <- tune_grid(
  object = wf,
  resamples = vfold,
  grid = 5
)
#----------------------------
# Plug the best hyperparameters back into the workflow. The metric is
# named explicitly; leaving it off triggered the warning
# "No value of `metric` was given; metric 'rmse' will be used."
wf <- wf %>% finalize_workflow(
  parameters = tg %>% select_best(metric = 'rmse')
)
4) using wf, fit train, finalize model
doParallel::registerDoParallel()
# Fit the finalized workflow on the full training set and print the
# underlying xgb.Booster (output captured below).
# NOTE(review): pull_workflow_fit() is deprecated in newer `workflows`
# releases in favor of extract_fit_parsnip() — confirm the installed
# version before switching.
(mdl = wf %>% fit(train) %>% pull_workflow_fit())
## parsnip model object
##
## Fit time: 8s
## ##### xgb.Booster
## raw: 1.1 Mb
## call:
## xgboost::xgb.train(params = list(eta = 0.0298693125601242, max_depth = 11L,
## gamma = 0.0574388244705595, colsample_bytree = 1, min_child_weight = 20L,
## subsample = 0.582812185315415), data = x$data, nrounds = 1606L,
## watchlist = x$watchlist, verbose = 0, objective = "reg:squarederror",
## nthread = 1)
## params (as set within xgb.train):
## eta = "0.0298693125601242", max_depth = "11", gamma = "0.0574388244705595", colsample_bytree = "1", min_child_weight = "20", subsample = "0.582812185315415", objective = "reg:squarederror", nthread = "1", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## callbacks:
## cb.evaluation.log()
## # of features: 324
## niter: 1606
## nfeatures : 324
## evaluation_log:
## iter training_rmse
## 1 4.585
## 2 4.448
## ---
## 1605 0.055
## 1606 0.055
5) using wf, fit train, eval test, return results
doParallel::registerDoParallel()
# last_fit() refits the finalized workflow on the training portion of
# `split` and evaluates once on the held-out test portion.
jw.fin = wf %>% last_fit(split)
jw.fin$.metrics # test-set rmse/rsq (output captured below)
## [[1]]
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.0705 Preprocessor1_Model1
## 2 rsq standard 0.847 Preprocessor1_Model1
## [[1]]
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: boost_tree()
##
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
##
## * step_log()
## * step_unknown()
## * step_bagimpute()
## * step_zv()
## * step_nzv()
## * step_corr()
## * step_normalize()
## * step_dummy()
##
## -- Model -----------------------------------------------------------------------
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb
## call:
## xgboost::xgb.train(params = list(eta = 0.0298693125601242, max_depth = 11L,
## gamma = 0.0574388244705595, colsample_bytree = 1, min_child_weight = 20L,
## subsample = 0.582812185315415), data = x$data, nrounds = 1606L,
## watchlist = x$watchlist, verbose = 0, objective = "reg:squarederror",
## nthread = 1)
## params (as set within xgb.train):
## eta = "0.0298693125601242", max_depth = "11", gamma = "0.0574388244705595", colsample_bytree = "1", min_child_weight = "20", subsample = "0.582812185315415", objective = "reg:squarederror", nthread = "1", validate_parameters = "TRUE"
## callbacks:
## cb.evaluation.log()
## # of features: 324
## niter: 1606
## nfeatures : 324
## evaluation_log:
## iter training_rmse
## 1 4.585
## 2 4.448
## ---
## 1605 0.055
## 1606 0.055
# Recompute a richer set of test-set metrics (rmse/rsq/mae/mape) from the
# collected predictions. Both .pred and sale_price are on the log10 scale
# here, because the recipe log-transformed the outcome.
jw.fin %>% collect_predictions() %>%
select(.pred, sale_price) %>%
#if you want to see what things would like in terms of actual sales dollars (as opposed to log10-dollars)
# mutate(
# .pred = 10^ .pred,
# sale_price = 10^ sale_price
# ) %>%
yardstick::metric_set(rmse, rsq, mae, mape)(estimate = .pred, truth = sale_price)
# Interactive scatter of predicted vs actual (log10) sale prices on the
# test set, with a linear trend line for reference.
plotly::ggplotly(
jw.fin %>% collect_predictions() %>%
select(.pred, sale_price) %>%
#if you want to see what things would like in terms of actual sales dollars (as opposed to log10-dollars)
# mutate(
# .pred = 10^ .pred,
# sale_price = 10^ sale_price
# ) %>%
ggplot(aes(x = .pred, y = sale_price)) +
geom_point(alpha = 0.5) +
geom_smooth(method = 'lm', linetype = 'longdash', se = FALSE) +
labs(
title = 'Test: Predicted Sale Prices vs Actuals',
x = 'predicted prices',
y = 'actual prices'
)
)
Interpretable ML (IML) with SHAPforxgboost
References
#1: https://liuyanguu.github.io/post/2019/07/18/visualization-of-shap-for-xgboost/
#2: https://christophm.github.io/interpretable-ml-book/shap.html
0) create standard xgb mdl
# Refit a plain xgboost model with the tuned hyperparameters, because
# SHAPforxgboost works on a raw xgb.Booster rather than a workflow.
X_train <- baked.train %>% select(-sale_price) %>% as.matrix() #xgboost only takes a matrix for
y <- baked.train$sale_price #aka the 'label' in xgboost() arguments
# hyperparameter tuning results from xgb mdl above; will use these hps below
# (metric named explicitly to avoid the "No value of `metric` was given" warning)
(best.hps <- tg %>% select_best(metric = 'rmse'))
# Ref:https://xgboost.readthedocs.io/en/latest/parameter.html
# Ref:https://parsnip.tidymodels.org/reference/boost_tree.html#arguments #PARAMETER TRANSLATIONS
mdl.xgb <- xgboost::xgboost(
  data = X_train,
  label = y,
  #----------------------------
  max_depth = best.hps$tree_depth,
  nrounds = best.hps$trees,
  eta = best.hps$learn_rate,
  min_child_weight = best.hps$min_n,
  gamma = best.hps$loss_reduction,
  #colsample_bytree = 0.427692307692308, #mtry; this value is expressed as a ratio; you can find this value by executing ''mdl' and checking the 'call' details
  subsample = best.hps$sample_size,
  #----------------------------
  # never request fewer than 1 thread (detectCores() - 2 is <= 0 on 1-2 core machines)
  nthread = max(1, parallel::detectCores() - 2),
  verbose = FALSE,
  objective = 'reg:squarederror'
)
1) Local SHAP shap.values
### 0) create shap.values object (shows SHAP vals for EVERY row in train)
shap.vals <- shap.values(
  xgb_model = mdl.xgb,
  X_train = X_train
)
### 1> check out aggregate mean shap scores for top 10 ftrs
# Sort the mean |SHAP| scores once and reuse (the original sorted twice).
top10.shap <- shap.vals$mean_shap_score %>% sort(decreasing = TRUE) %>% head(10)
tibble(
  features = names(top10.shap),
  SHAP = top10.shap
)
### 2> visualize 1st observation's ftrs vs their respective SHAP vals
# Horizontal bar chart of per-feature SHAP values for training row 1:
# green bars push the prediction up, dark red bars push it down; the
# x-axis title shows the row's total SHAP contribution.
shap.vals$shap_score %>% dplyr::slice(1) %>%
pivot_longer(everything()) %>%
filter(value != 0) %>% #remove features with a 0 SHAP val
mutate(name = fct_reorder(name, value)) %>%
plotly::plot_ly(
x = ~value,
y = ~name,
color = ~if_else(value < 0, I('darkred'), I('darkgreen'))
) %>% layout(
title = 'Observation 1: Shap Values of Each Feature',
xaxis = list(title = paste0(
'Cumulative Total SHAP Values: ',
round(shap.vals$shap_score %>% dplyr::slice(1) %>% rowSums(), 2)
)),
yaxis = list(title = '')
)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning in if (is.na(col)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(col)) {: the condition has length > 1 and only the first
## element will be used
## Warning: `error_y.color` does not currently support multiple values.
## Warning: `error_x.color` does not currently support multiple values.
### 3> 1st observation's features vals
# Raw (baked) feature values for the same observation plotted above.
baked.train[1,]
2) Global SHAP for shap.plot.summary.wrap1
#https://www.rdocumentation.org/packages/SHAPforxgboost/versions/0.0.4/topics/shap.plot.summary.wrap1
# Global SHAP summary (beeswarm) plot for the top 10 features, made
# interactive with plotly.
ggplotly(
shap.plot.summary.wrap1(
mdl.xgb,
X_train,
top_n = 10
##dilute = 10 #to use only 1/10th of data
) + labs(
title = 'Global SHAP feature importance'
))
2.5) create ‘long’ shap.values object shap.prep
# To prepare the long-format data:
# one row per (observation, feature) pair, as required by the
# shap.plot.* functions used below.
shap.long <- shap.prep(xgb_model = mdl.xgb, X_train = X_train)
3) dependence plot shap.plot.dependence
# Dependence plots for the four features with the highest mean |SHAP|
# (mean_shap_score is already sorted in decreasing order), laid out 2x2.
fig.list <- lapply(
  names(shap.vals$mean_shap_score)[1:4],
  shap.plot.dependence,
  data_long = shap.long
)
gridExtra::grid.arrange(grobs = fig.list, ncol = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

4) interaction plot shap.plot.dependence & shap.interaction
#NOTE: takes time to run!
#SHAP interaction values can be interpreted as the difference between the SHAP values for feature i when feature j is present and the SHAP values for feature i when feature j is absent.
# Compute the full SHAP interaction array, then plot the interaction
# between total basement area and 1st-floor area.
shap.interaction = shap.prep.interaction(
xgb_mod = mdl.xgb,
X_train = X_train
)
shap.plot.dependence(data_long = shap.long, data_int = shap.interaction, x = 'total_bsmt_sf', y = 'x1st_flr_sf')
## `geom_smooth()` using formula 'y ~ x'

5) force plot shap.prep.stack.data
# Stacked force-plot data: keep the top 3 features and cluster the
# observations into 3 groups; all remaining features are summed into
# 'rest_variables' (see the message below).
force.plot.data <- shap.prep.stack.data(
  shap_contrib = shap.vals$shap_score,
  top_n = 3,
  n_groups = 3
)
## The SHAP values of the Rest 321 features were summed into variable 'rest_variables'.
shap.plot.force_plot(force.plot.data)
## Data has N = 366 | zoom in length is 50 at location 219.6.

shap.plot.force_plot_bygroup(force.plot.data)

TODO: revisit the dependence plots above, and work through the interpretation of the interaction plot.